rm(list=ls())
library(car)
library(ggplot2)
library(ggrepel)
library(reshape2)
library(ggpubr)
library(dplyr)
library(limma)
library(MASS)
knitr::opts_chunk$set(echo = TRUE)
options(width = 60)
plot_opts = theme_bw()+
theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black") , axis.title.x = element_text(face='bold', size = 14), axis.title.y = element_text(face='bold', size = 14), plot.title = element_text(face='bold', size = 18, hjust=0.5))
plotdists = function(df, path){
plotlist = list()
for (col in colnames(df)){
x = df[,col]
if (class(x)=='factor'){
dfnew = data.frame(col = factor(), count = integer())
for (level in levels(x)){
count = length(which(x==level))
dfnew = rbind(data.frame(col= level, count = count), dfnew)
}
dfnew$col <- factor(dfnew$col, levels = dfnew$col[order(dfnew$count)])
p = ggplot(dfnew, aes(x=col, y=count))+
geom_bar(stat= 'identity')+
plot_opts+
labs(x = col, y = 'Count', title = paste(col, "Distribution"))+
geom_text(aes(label = count), vjust = -0.3)
plotlist[[col]] = p
}else if (class(x) == 'numeric' | class(x) == 'integer'){
dfnew =data.frame(col = class(x))
histinfo = hist(x = x , breaks='Scott', plot = F)
p = ggplot(as.data.frame(x), aes(x=x))+
geom_histogram(bins = length(histinfo$breaks))+plot_opts+
#geom_density(aes(y=..count..), size = 2)+plot_opts+
geom_vline(aes(xintercept = median(x)),
linetype = "dashed", size = 0.6)+
labs(x = col, y = 'Count', title = paste(col, 'Distribution'))
plotlist[[col]] = p
}
}
pfinal =ggarrange(plotlist = plotlist)
ggsave(path, pfinal, height=2.5*length(plotlist) , width=2.5*length(plotlist), units="in", limitsize = FALSE, dpi=300)
return(pfinal)
}
Assumption_Check = function(l_m, outp){
df = l_m[['model']]
df$residuals = l_m[["residuals"]]
df = df[,-which(colnames(df)==l_m[["call"]][["formula"]][[2]])]
#Assumption 1 - linearity check
a1pls = list()#assumption 1 plotlist
for (col in colnames(df)[-which(colnames(df)=='residuals')]){
dfnew = data.frame(x = df[,col], y = df$residuals)
a1pls[[col]] = ggplot(data = dfnew, aes(x =x, y=y))+
geom_jitter()+plot_opts+labs(x = col, y = 'Residuals')+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
geom_hline(yintercept=0,linetype="dashed",color = "red", size=1)
}
p1 = ggarrange(plotlist= a1pls)
p1 = annotate_figure(p1,text_grob("Linearity Checks", color = "red", face = "bold", size = 20))
ggsave(filename = paste(outp,'linearity.png',sep=""), plot = p1, dpi = 600)
#Assumption 2 - Bias and Scedasticity
df2 = data.frame(Fit = l_m$fitted.values, Residuals = l_m$residuals)
p2 = ggplot(data = df2, aes(x = Fit, y = Residuals))+
geom_point()+plot_opts+geom_hline(yintercept=0,linetype="dashed",color = "red", size=1)+labs(title = 'Bias and Scedasticity Check')+theme(plot.title = element_text(color = 'red'))
ggsave(filename = paste(outp,'bias_sced.png',sep=""), plot = p2, dpi = 600)
#Assumption 3 - Correlation in Errors
a3pls = list()
for (n in colnames(df[,-which(colnames(df)=='residuals')])){
dfnew = data.frame(y = df$residuals[order(df[,n])], x = 1:nrow(df))
a3pls[[n]] = ggplot(data = dfnew, aes(x = x, y = y))+geom_jitter()+
plot_opts+labs(title = paste('Sorted By:', n), x = 'Index', y='Residuals')
}
p3 = ggarrange(plotlist = a3pls)
p3 = annotate_figure(p3,text_grob("Error Independence Check", color = "red", face = "bold", size = 20))
ggsave(filename = paste(outp,'inderror.png',sep=""), plot = p3, dpi = 600)
#Assumption 4 - Normality of Residuals
shapres = shapiro.test(l_m$residuals)
p4 = ggplot(df)+
geom_qq(aes(sample = residuals))+geom_qq_line(aes(sample= residuals))+
plot_opts+ labs(title = paste('Normality of Residuals\n', 'Shaprio Wilks Results: W = ', as.character(round(shapres$statistic,3)), ', p = ', as.character(round(shapres$p.value,5))), x = 'Theoretical Values', y = 'Sample Values')+ theme(plot.title = element_text(color = 'red'))
ggsave(filename = paste(outp,'normres.png',sep=""), plot = p4, dpi = 600)
pfinal = ggarrange(plotlist = list(p1, p2, p3, p4))
pfinal= annotate_figure(pfinal,text_grob("Model Assumption Check", face = "bold", size = 26))
ggsave(filename = paste(outp,'all_assum.png',sep=""), plot = pfinal, dpi = 600, width = 8, height = 10, units = 'in')
return(pfinal)
}
infl_analysis = function(l_m, df){
k = length(l_m$coefficients)-1
n = nrow(df)
row_num = 1:n
#response_v = df[colnames(df)==l_m$terms[[2]]] #use if you would like to change the labels of the points to the response variable rather than observation number
#Leverage points
hatdf = data.frame(Values = hatvalues(l_m), Row_Num = row_num, Type = rep('Hat Values', length(row_num)), Point_Type = rep('Leverage', length(row_num)), Bound1 = 2*(k+1)/n, Bound2 = 2*(k+1)/n)
hatdf$Label = NA
inds = which(hatvalues(l_m)>2*(k+1)/n)
if(length(inds)!= 0){hatdf$Label[inds] = row_num[inds]}
#Outliers
instdf = data.frame(Values = rstandard(l_m), Row_Num = row_num, Type = rep('Internally Standardized Residuals', length(row_num)), Point_Type = rep('Outlier', length(row_num)), Bound1 = 3, Bound2 = -3)
instdf$Label = NA
inds = which(rstandard(l_m) > 3 | rstandard(l_m) < -3)
if(length(inds)!=0){instdf$Label[inds] = row_num[inds]}
extdf = data.frame(Values = rstudent(l_m), Row_Num = row_num, Type = rep('Externally Standardized Residuals', length(row_num)), Point_Type = rep('Outlier', length(row_num)), Bound1 = 3, Bound2 = -3)
extdf$Label = NA
inds = which(rstudent(l_m) > 3 | rstudent(l_m) < -3)
if(length(inds)!=0){extdf$Label[inds] = row_num[inds]}
#Influential
dfitsdf = data.frame(Values = dffits(l_m), Row_Num = row_num, Type = rep('DEFFITS', length(row_num)),Point_Type = rep('Influential', length(row_num)), Bound1 = 2*sqrt((k+2)/(n-k-2)), Bound2 = -2*sqrt((k+2)/(n-k-2)))
dfitsdf$Label = NA
inds = which(dffits(l_m) > 2*sqrt((k+2)/(n-k-2)) | dffits(l_m) < -2*sqrt((k+2)/(n-k-2)))
if(length(inds)!=0){dfitsdf$Label[inds] = row_num[inds]}
cddf = data.frame(Values = cooks.distance(l_m), Row_Num = row_num, Type = rep("Cook's Distance", length(row_num)),Point_Type = rep('Influential', length(row_num)), Bound1 = 1, Bound2 = 1)
cddf$Label = NA
inds = cooks.distance(l_m) > 1
if(length(inds)!=0){cddf$Label[inds] = row_num[inds]}
cvdf = data.frame(Values = covratio(l_m), Row_Num = row_num, Type = rep("Covariance Ratio", length(row_num)),Point_Type = rep('Influential', length(row_num)), Bound1 = 1 + 3*(k+1)/n, Bound2 = 1 - 3*(k+1)/n)
cvdf$Label = NA
inds = covratio(l_m) > 1 + 3*(k+1)/n | covratio(l_m) < 1 - 3*(k+1)/n
if(length(inds)!=0){cvdf$Label[inds] = row_num[inds]}
ret_df = rbind(hatdf, instdf, extdf, dfitsdf, cddf, cvdf)
return(ret_df)
}
df = read.csv("../Data/master.csv", stringsAsFactors = T)
str(df)
## 'data.frame': 27820 obs. of 12 variables:
## $ country : Factor w/ 101 levels "Albania","Antigua and Barbuda",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 1987 1987 1987 1987 1987 1987 1987 1987 1987 1987 ...
## $ sex : Factor w/ 2 levels "female","male": 2 2 1 2 2 1 1 1 2 1 ...
## $ age : Factor w/ 6 levels "15-24 years",..: 1 3 1 6 2 6 3 2 5 4 ...
## $ suicides_no : int 21 16 14 1 9 1 6 4 1 0 ...
## $ population : int 312900 308000 289700 21800 274300 35600 278800 257200 137500 311000 ...
## $ suicides.100k.pop : num 6.71 5.19 4.83 4.59 3.28 2.81 2.15 1.56 0.73 0 ...
## $ country.year : Factor w/ 2321 levels "Albania1987",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ HDI.for.year : num NA NA NA NA NA NA NA NA NA NA ...
## $ gdp_for_year.... : Factor w/ 2321 levels "1,002,219,052,968",..: 727 727 727 727 727 727 727 727 727 727 ...
## $ gdp_per_capita....: int 796 796 796 796 796 796 796 796 796 796 ...
## $ generation : Factor w/ 6 levels "Boomers","G.I. Generation",..: 3 6 3 2 1 2 6 1 2 3 ...
Below will examine the na values in each column:
for (col in colnames(df)){
print(length(which(is.na(df[,col]))))
}
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 19456
## [1] 0
## [1] 0
## [1] 0
It appears that the HDI for year column has the majority of NAs and will be discluded from the remaining of the analysis. The data will be filtered for the United States only.
## 'data.frame': 372 obs. of 6 variables:
## $ year : int 1985 1985 1985 1985 1985 1985 1985 1985 1985 1985 ...
## $ sex : Factor w/ 2 levels "female","male": 2 2 2 2 2 1 1 1 1 1 ...
## $ age : Factor w/ 6 levels "15-24 years",..: 6 5 2 3 1 3 5 6 2 1 ...
## $ suicides.100k.pop : num 53.6 29.5 24.5 22.8 21.4 ...
## $ gdp_for_year.... : num 4.35e+12 4.35e+12 4.35e+12 4.35e+12 4.35e+12 ...
## $ gdp_per_capita....: int 19693 19693 19693 19693 19693 19693 19693 19693 19693 19693 ...
colnames(df) = c('Sex', 'Age', 'Suicide_Rate', 'GDP_per_Capita')
df$Age <- relevel(df$Age, ref = '5-14 years')
df$Sex = factor(paste(toupper(strsplit2(df$Sex, split ="")[,1])))
path = "../Plots/distplots.png"
plotdists(df, path)
mlr2 = lm(Suicide_Rate ~ Sex*Age + GDP_per_Capita, data = df)
summary(mlr2)
##
## Call:
## lm(formula = Suicide_Rate ~ Sex * Age + GDP_per_Capita, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.2498 -1.0883 -0.2779 0.9381 13.2023
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.989e+00 6.393e-01 4.675 4.16e-06
## SexM 6.410e-01 6.653e-01 0.963 0.336
## Age15-24 years 3.361e+00 6.653e-01 5.052 6.96e-07
## Age25-34 years 4.889e+00 6.653e-01 7.349 1.36e-12
## Age35-54 years 7.200e+00 6.653e-01 10.822 < 2e-16
## Age55-74 years 5.767e+00 6.653e-01 8.669 < 2e-16
## Age75+ years 4.363e+00 6.653e-01 6.558 1.90e-10
## GDP_per_Capita -6.560e-05 1.102e-05 -5.950 6.37e-09
## SexM:Age15-24 years 1.450e+01 9.409e-01 15.414 < 2e-16
## SexM:Age25-34 years 1.710e+01 9.409e-01 18.174 < 2e-16
## SexM:Age35-54 years 1.621e+01 9.409e-01 17.234 < 2e-16
## SexM:Age55-74 years 1.889e+01 9.409e-01 20.072 < 2e-16
## SexM:Age75+ years 3.917e+01 9.409e-01 41.636 < 2e-16
##
## (Intercept) ***
## SexM
## Age15-24 years ***
## Age25-34 years ***
## Age35-54 years ***
## Age55-74 years ***
## Age75+ years ***
## GDP_per_Capita ***
## SexM:Age15-24 years ***
## SexM:Age25-34 years ***
## SexM:Age35-54 years ***
## SexM:Age55-74 years ***
## SexM:Age75+ years ***
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.619 on 359 degrees of freedom
## Multiple R-squared: 0.9621, Adjusted R-squared: 0.9608
## F-statistic: 758.9 on 12 and 359 DF, p-value: < 2.2e-16
outp = "../Plots/"
p = ggplot(data = df, aes(x = GDP_per_Capita, y = Suicide_Rate))+
geom_point(aes(color = Age), alpha = 0.5)+plot_opts+facet_wrap(~Sex)+
labs(x = 'GDP per Capita', y = 'Suicide Number / 100k Population', title = 'Suicide Rate in US from 1985 - 2016')
p
ggsave(filename = paste(outp,'scatter.png',sep=""), plot = p, dpi = 600, width = 8, height = 4, units = 'in')
pf = Assumption_Check(mlr2, outp)
## Saving 12 x 12 in image
## Saving 12 x 12 in image
## Saving 12 x 12 in image
## Saving 12 x 12 in image
pf
ret_df = infl_analysis(mlr2, df =df)
ret_df = cbind(ret_df, df)
p = ggplot(data= ret_df, aes(x= Row_Num, y = Values))+
geom_point(aes(color = Age, shape = Sex))+
facet_wrap(~Type, scales = "free_y")+plot_opts+geom_line(aes(y=Bound1))+geom_line(aes(y=Bound2))+
geom_label_repel(aes(label=Label))+
labs(title = 'Influential Point Analysis', x = 'Observation Number')
p
ggsave(filename = paste(outp,'influential.png',sep=""), plot = p, dpi = 600, width = 12, height = 6, units = 'in')
bc = boxcox(mlr2, data = df)
p = bc$x[which.max(bc$y)]
df$Suicide_Rate = df$Suicide_Rate**p
##
## Call:
## lm(formula = Suicide_Rate ~ Sex * Age + GDP_per_Capita, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.062511 -0.021512 -0.002907 0.018208 0.098173
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.987e-01 6.750e-03 133.140 < 2e-16
## SexM 1.274e-01 7.024e-03 18.134 < 2e-16
## Age15-24 years 3.267e-01 7.024e-03 46.515 < 2e-16
## Age25-34 years 3.868e-01 7.024e-03 55.068 < 2e-16
## Age35-54 years 4.530e-01 7.024e-03 64.494 < 2e-16
## Age55-74 years 4.133e-01 7.024e-03 58.842 < 2e-16
## Age75+ years 3.656e-01 7.024e-03 52.046 < 2e-16
## GDP_per_Capita -5.207e-07 1.164e-07 -4.473 1.04e-05
## SexM:Age15-24 years 1.815e-01 9.934e-03 18.271 < 2e-16
## SexM:Age25-34 years 1.652e-01 9.934e-03 16.633 < 2e-16
## SexM:Age35-54 years 1.125e-01 9.934e-03 11.324 < 2e-16
## SexM:Age55-74 years 1.627e-01 9.934e-03 16.377 < 2e-16
## SexM:Age75+ years 3.364e-01 9.934e-03 33.868 < 2e-16
##
## (Intercept) ***
## SexM ***
## Age15-24 years ***
## Age25-34 years ***
## Age35-54 years ***
## Age55-74 years ***
## Age75+ years ***
## GDP_per_Capita ***
## SexM:Age15-24 years ***
## SexM:Age25-34 years ***
## SexM:Age35-54 years ***
## SexM:Age55-74 years ***
## SexM:Age75+ years ***
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02766 on 359 degrees of freedom
## Multiple R-squared: 0.9872, Adjusted R-squared: 0.9868
## F-statistic: 2314 on 12 and 359 DF, p-value: < 2.2e-16
outp = "../Plots/trans_"
pf = Assumption_Check(mlr3, outp)
## Saving 12 x 12 in image
## Saving 12 x 12 in image
## Saving 12 x 12 in image
## Saving 12 x 12 in image
pf
ret_df = infl_analysis(mlr3, df =df)
ret_df = cbind(ret_df, df)
p = ggplot(data= ret_df, aes(x= Row_Num, y = Values))+
geom_point(aes(color = Age, shape = Sex))+
facet_wrap(~Type, scales = "free_y")+plot_opts+geom_line(aes(y=Bound1))+geom_line(aes(y=Bound2))+
geom_label_repel(aes(label=Label))+
labs(title = 'Influential Point Analysis', x = 'Observation Number')
p
ggsave(filename = paste(outp,'influential.png',sep=""), plot = p, dpi = 600, width = 12, height = 6, units = 'in')
p = ggplot(data = df, aes(x = GDP_per_Capita, y = Suicide_Rate))+
geom_point(aes(color = Age), alpha = 0.5)+plot_opts+facet_wrap(~Sex)+
labs(x = 'GDP per Capita', y = '[Suicide Number / 100k Population]^0.141', title = 'Suicide Rate in US from 1985 - 2016')
p
ggsave(filename = paste(outp,'scatter.png',sep=""), plot = p, dpi = 600, width = 10, height = 5, units = 'in')